################################################################################
# Code for the Monte Carlo Analysis
# Estimating Onsets of Binary Events in Panel Data
# Liam F. McGrath
#
# Purpose: This code is used to conduct the monte carlo analyses displayed in
#          the paper. Its main output is the .csv files collected_props.csv
#          and scenarios.csv which store the output of the monte carlo runs
#          that are analysed in the script mc_analysis.R

rm(list = ls())

library(MASS)
library(parallel)

# IMPORTANT:
# Define the path which all the code, data and results are located within
# From this changes in working directory are formed by pasting this
# string with a string for the relevant subfolder
# If using this code be sure to change to this to your own directory

dir <- "/Users/liammcgrath/Documents/Academic/Research/transition/replication_materials/"

# Function for calculating time since last event
# Code taken from DAMisc package (David Armstrong's Miscellaneous Functions)

btscs <-
  function (data, event, tvar, csunit, pad.ts = FALSE) 
  {
    data$orig_order <- 1:nrow(data)
    data <- data[order(data[[csunit]], data[[tvar]]), ]
    spells <- function(x) {
      tmp <- rep(0, length(x))
      runcount <- 0
      for (j in 2:length(x)) {
        if (x[j] == 0 & x[(j - 1)] == 0) {
          tmp[j] <- runcount <- runcount + 1
        }
        if (x[j] != 0 & x[(j - 1)] == 0) {
          tmp[j] <- runcount + 1
          runcount <- 0
        }
        if (x[j] == 0 & x[(j - 1)] != 0) {
          tmp[j] <- runcount <- 0
        }
      }
      tmp
    }
    sp <- split(data, data[[csunit]])
    if (pad.ts) {
      sp <- lapply(sp, function(x) x[match(seq(min(x[[tvar]], 
                                                   na.rm = T), max(x[[tvar]], na.rm = T)), x[[tvar]]), 
                                     ])
      for (i in 1:length(sp)) {
        if (any(is.na(sp[[i]][[event]]))) {
          sp[[i]][[event]][which(is.na(sp[[i]][[event]]))] <- 1
        }
        if (any(is.na(sp[[i]][[tvar]]))) {
          sp[[i]][[tvar]] <- seq(min(sp[[i]][[tvar]], na.rm = T), 
                                 max(sp[[i]][[tvar]], na.rm = T))
        }
        if (any(is.na(sp[[i]][[csunit]]))) {
          sp[[i]][[csunit]][which(is.na(sp[[i]][[csunit]]))] <- mean(sp[[i]][[csunit]], 
                                                                     na.rm = T)
        }
      }
    }
    sp <- lapply(1:length(sp), function(x) {
      cbind(sp[[x]], data.frame(spell = spells(sp[[x]][[event]])))
    })
    data <- do.call(rbind, sp)
    if (!pad.ts) {
      if (any(is.na(data$orig_order))) {
        data <- data[-which(is.na(data$orig_order)), ]
      }
      data <- data[data$orig_order, ]
    }
    else {
      data <- data[order(data[[csunit]], data[[tvar]]), ]
    }
    invisible(data)
  }



# Definition of DGP function for the monte carlo
# Arguments:
# units, time, a, b, g , d
#
# Equation: 
#
#   y_t* = a + b*x + d*y_lag + g*x*y_lag 
#

mc_trans <- function(units, time, a, b, g, d) {
  
  ############################
  # START OF DATA GENERATION #
  ############################
  
  
  # First create the data output matrix
  # "unit", "time", "y", "y_lag", "x"
  data <- matrix(NA, nrow = (units * time), ncol = 5)
  
  # Indicator var
  
  indic <- 1
  
  # Create data for x
  
  data[,5] <- rnorm((units*time), 0, 1)
  
  
  # For convenience in data generation, now create vector for x1
  
  x <- data[, 5]
  
  # Now create the values of y and y_lag, based off of this
  
  for (i in 1:units) {
    
    for (j in 1:time) {
      
      
      if (j == 1) {
        
        # For first time period, we generate it using the
        # pr(war_t = 1| war_t-1 = 0) equation
        
        data[indic, 3] <- ifelse((a + b*x[indic] +  
                                    rlogis(1,0,1)) > 0, 
                                 1, 0)
        data[indic, 1] <- i
        data[indic, 2] <- j
        
        # Put in lag of y
        # In first time period, this is assumed 0
        
        data[indic, 4] <- 0
        
        indic <- indic + 1
        
      } else {
        
        # Put in lag of y
        
        data[indic, 4] <- data[(indic - 1), 3]
        
        # Generate y
        
        data[indic, 3] <- ifelse((a + b*x[indic] + 
                                    d*data[indic,4] + 
                                    g*x[indic]*data[indic,4] + 
                                    rlogis(1,0,1)) > 0, 
                                 1, 0)
        
        # Store country and year
        data[indic, 1] <- i
        data[indic, 2] <- j
        
        
        indic <- indic + 1
        
      }
    }        
  }
  
  data <- as.data.frame(data)
  
  colnames(data) <- c("unit", "time", "y", "y_lag", "x")
  
  # Create the variable where ongoing years of war are
  # set to a 0
  data$y_trans <- ifelse(data$y_lag == 1, 0, data$y)
  
  # Create objects for mean of y and mean of y_trans
  # to be reported later
  
  mean_y <- mean(data$y)
  mean_y_trans <- mean(data$y_trans)
  
  # Create objects summarising frequencies that are important for bias
  sum_ongoing <- sum(data$y * data$y_lag)
  sum_ending <- sum((1 - data$y)*data$y_lag)
  
  
  ###########################
  # END OF DATA GENERATION  #  
  ###########################
  
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  
  ###############################
  # START OF ESTIMATION SECTION #
  ###############################
  
  # Models: - estimate the logit with the transformed y
  #            (have to include duration dependence, in 2
  #              ways: 1, based off orig var
  #                    2, based off transformed var)
  #            - model3: estimate the markov transition on untransformed y
  #            - model4: estimate transformed y with lag of untransformed y 
  
  
  # Data with t , calculated off original y
  data_orig_t <- btscs(data, event = "y", tvar = "time",
                       csunit = "unit")
  # Create polynomial
  data_orig_t$spell2 <- (data_orig_t$spell)^2
  data_orig_t$spell3 <- (data_orig_t$spell)^3
  
  
  # Data with t, calculated off transformed y
  data_trans_t <- btscs(data, event = "y_trans", 
                        tvar = "time", csunit = "unit")
  # Create polynomial
  data_trans_t$spell2 <- (data_trans_t$spell)^2
  data_trans_t$spell3 <- (data_trans_t$spell)^3
  
  
  # Estimation:
  
  # Transformed y, temporal based on orig
  out_trans_t_o <- glm(y_trans ~ x + spell + spell2
                       + spell3, 
                       family = binomial(link = "logit"),
                       data = data_orig_t)
  
  
  # Transformed y, temporal based on transformed
  out_trans_t_t <- glm(y_trans ~ x + spell + spell2
                       + spell3, 
                       family = binomial(link = "logit"),
                       data = data_trans_t)
  
  # Fearon model
  out_fearon <- glm(y_trans ~ x + y_lag + spell + spell2 + spell3,
                    family = binomial(link = "logit"),
                    data = data_orig_t)
  
  # Markov transition
  
  out_orig <- glm(y ~ x + y_lag + I(x*y_lag),
                  family = binomial(link = "logit"),
                  data = data_orig_t)  
  
  
  #####################
  # END OF ESTIMATION #
  #####################
  
  
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
  
  #################################
  # BEGIN STORAGE OF ESTIMATES    #
  #################################
  
  # Store the coefficients
  # As well as their associated standard errors
  
  # Model 1: Transformed y, original y for t
  
  b_model1 <- as.numeric(out_trans_t_o$coefficients[2])
  b_se_model1 <- sqrt(vcov(out_trans_t_o)[2, 2])
  
  # Model 2: Transformed y, transformed y for t
  
  b_model2 <- as.numeric(out_trans_t_t$coefficients[2])
  b_se_model2 <- sqrt(vcov(out_trans_t_t)[2, 2])
  
  # Model 3: Markov transition
  
  b_model3 <- as.numeric(out_orig$coefficients[2])
  b_se_model3 <- sqrt(vcov(out_orig)[2, 2])
  
  # Model 4: Markov transition
  
  b_model4 <- as.numeric(out_fearon$coefficients[2])
  b_se_model4 <- sqrt(vcov(out_fearon)[2, 2])
  
  ############################
  # END STORAGE OF ESTIMATES #
  ############################
  
  
  
  ######################
  # OUTPUT THE RESULTS #
  ######################
  
  return(cbind(b_model1, b_model2, b_model3,
               b_se_model1, b_se_model2, b_se_model3, 
               mean_y, mean_y_trans, sum_ongoing, sum_ending,
               b_model4, b_se_model4)
  )
  
  
  
  ##################
  ##################
  ## END FUNCTION ##
  ##################
  ##################
  
}



#### Code to run the Monte Carlo
setwd(paste0(dir, "output/"))

set.seed(123) # set seed for replication purposes

n <- 1000   # number of simulations for each scenario

units <- 100 # number of units
time <- 40   # number of time periods
a <- seq(-5,-2, 1)  # intercept when y_t-1 = 0
d <- seq(2,6,1)     # change in intercept when y_t-1 = 1
b <- seq(-2.5, 2.5,0.5) # effect of x when y_t-1 = 0 (i.e. effect on onsets)
g <- seq(-2.5, 2.5,0.5) # change in effect of x when y_t-1 = 1

# scenarios defined by all possible combinations of parameters
scen <- expand.grid(units = units, time = time, a = a, b = b, g = g, d = d)

# Save details of scenarios
write.csv(scen, "scenarios.csv")

# Loop over scenarios and apply esmates function
# then save the histograms of interest.
# Note parallel processing is used with 8 cores
# Be sure to change the mc.cores argument in mcmapply to
# your own number of available cores if you wish to run yourself

for (i in 1:nrow(scen)) {
  
  cat("Scenario Starting: ", i, "\n")
  start_time <- proc.time()
  
  # Set up reps for current scenario
  cur_scen <- scen[i, ]
  
  cur_scen <- matrix(rep(cur_scen, n), nrow = n, ncol = 6, byrow = T)
  
  cur_scen <- as.data.frame(cur_scen)
  
  colnames(cur_scen) <- c("units", "time", "a", "b", "g", "d")
  
  # Now get estimates
  cur_estimates <- mcmapply(mc_trans, units = cur_scen$units, 
                            time = cur_scen$time, a = cur_scen$a,
                            b = cur_scen$b, g = cur_scen$g,
                            d = cur_scen$d,
                            mc.cores = 8 # change to num of cores allowed
  )
  
  cur_estimates <- t(cur_estimates)
  
  cur_final <- as.data.frame(cbind(cur_scen, cur_estimates))
  
  colnames(cur_final) <- c("units", "time", "a", "b", "g", "d",
                           "b_model1", "b_model2", "b_model3",
                           "b_se_model1", "b_se_model2", "b_se_model3", 
                           "mean_y", "mean_y_trans", 
                           "sum_ongoing", "sum_ending",
                           "b_model4", "b_se_model4"
  )
  
  
  ## Confidence intervals
  
  ci_b_model1 <- cbind(cur_final$b_model1 - 1.96*cur_final$b_se_model1,
                       cur_final$b_model1 + 1.96*cur_final$b_se_model1
  )
  
  ci_b_model2 <- cbind(cur_final$b_model2 - 1.96*cur_final$b_se_model2,
                       cur_final$b_model2 + 1.96*cur_final$b_se_model2
  )
  
  ci_b_model3 <- cbind(cur_final$b_model3 - 1.96*cur_final$b_se_model3,
                       cur_final$b_model3 + 1.96*cur_final$b_se_model3
  )
  
  ci_b_model4 <- cbind(cur_final$b_model4 - 1.96*cur_final$b_se_model4,
                       cur_final$b_model4 + 1.96*cur_final$b_se_model4
  )
  
  ## Indicator for whether the CI contains the true value
  
  prop_b_model1 <- ifelse((ci_b_model1[, 1] <= cur_final$b[[1]]) &
                            (ci_b_model1[, 2] >= cur_final$b[[1]]),
                          1, 0)
  
  prop_b_model2 <- ifelse((ci_b_model2[, 1] <= cur_final$b[[1]]) &
                            (ci_b_model2[, 2] >= cur_final$b[[1]]),
                          1, 0)
  
  prop_b_model3 <- ifelse((ci_b_model3[, 1] <= cur_final$b[[1]]) &
                            (ci_b_model3[, 2] >= cur_final$b[[1]]),
                          1, 0)
  
  prop_b_model4 <- ifelse((ci_b_model4[, 1] <= cur_final$b[[1]]) &
                            (ci_b_model4[, 2] >= cur_final$b[[1]]),
                          1, 0)
  
  # RMSE of coefficients
  
  rmse_b_model1 <- sqrt(mean((cur_final$b[[1]] 
                              - cur_final$b_model1)^2)
  )
  
  
  rmse_b_model2 <- sqrt(mean((cur_final$b[[1]] 
                              - cur_final$b_model2)^2)
  )
  
  
  rmse_b_model3 <- sqrt(mean((cur_final$b[[1]] 
                              - cur_final$b_model3)^2)
  )
  
  rmse_b_model4 <- sqrt(mean((cur_final$b[[1]] 
                              - cur_final$b_model4)^2)
  )
  
  # Average estimate for b
  mean_b_model1 <- mean(cur_final$b_model1)
  mean_b_model2 <- mean(cur_final$b_model2)
  mean_b_model3 <- mean(cur_final$b_model3)
  mean_b_model4 <- mean(cur_final$b_model4)
  
  
  
  # Save a .csv table of the results for this scenario
  
  props <- rbind(c("b", "g", "ci_prop_model_1", "ci_prop_model_2", 
                   "ci_prop_model_3", 
                   "rmse_b_model1", "rmse_b_model2",
                   "rmse_b_model3", "mean_y", "mean_y_trans",
                   "mean_b_model1", "mean_b_model2",
                   "mean_b_model3", "sum_ongoing", "sum_ending",
                   "ci_prop_model_4", "rmse_b_model4", "mean_b_model4"), 
                 c(cur_final$b[1], cur_final$g[1],
                   mean(prop_b_model1), mean(prop_b_model2),
                   mean(prop_b_model3),  rmse_b_model1,
                   rmse_b_model2, rmse_b_model3,
                   cur_final$mean_y[1], cur_final$mean_y_trans[1],
                   mean_b_model1, mean_b_model2,
                   mean_b_model3, cur_final$sum_ongoing[1], 
                   cur_final$sum_ending[1],
                   mean(prop_b_model4),  rmse_b_model4, mean_b_model4)       
  )
  
  
  write.csv(props, paste("ci_props_scenario", i, ".csv", sep = ""),
            row.names = F)
  
  
  cat("Scenario Finished: ", i, "\n Time Taken: ")
  print(proc.time() - start_time)
  
}

# We now have .csv files summarising the results of each scenario
# Now combine them all together into one data frame

data <- read.csv("ci_props_scenario1.csv")[2,]

for (i in 2:nrow(scen)) {
  
  curr <- read.csv(paste("ci_props_scenario", i,".csv", sep = ""))[2,]
  data <- rbind(data, curr)
  rm(curr)
}

colnames(data) <- c("b", "g", "ci_prop_model_1", "ci_prop_model_2", 
                    "ci_prop_model_3", "rmse_b_model1", "rmse_b_model2",
                    "rmse_b_model3", "mean_y", "mean_y_trans", "mean_b_model1",
                    "mean_b_model2", "mean_b_model3", "sum_ongoing",
                    "sum_ending", "ci_prop_model4", "rmse_b_model4",
                    "mean_b_model4"
)

# Save this data frame as a .csv file, to be used with mc_analysis.R
# This is how collected_props.csv in the output subfolder was created
write.csv(data, "collected_props.csv" , row.names = FALSE)

